/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5

//void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod,
//                            size_t plane_size, size_t plane_stride, size_t relu_type);
// x0 dst           x1 src           x2 bias
// w3 oc4div        w4 oc4mod        w5 plane_size
// x6 plane_stride  x7 relu_type

// PostFuncBiasReluC4Fp16: adds a 4-lane half-precision bias vector to every
// 4-lane vector read from src, optionally clamps the result, and scatters the
// vectors to dst with a fixed byte step between consecutive stores.
//
// Inputs (as read by the code below):
//   x0  dst         base of the output buffer
//   x1  src         input, consumed sequentially 8 bytes (4 halfwords) per element
//   x2  bias        consumed sequentially, 8 bytes per channel group
//   x3  oc4div      channel count handled by the full 4-lane path
//   x4  oc4mod      leftover channel count (1, 2 or 3 lanes are stored; 0 = none)
//   x5  plane_size  elements per channel group (only the low 32 bits, w5, are used)
//   x6  plane_stride  added to the src pointer after each full channel group
//   x7  relu_type   3 -> clamp to [0, 6]; 1 -> clamp to >= 0; anything else -> no clamp
//
// Scratch registers written: x10, x12, x13, x14, x15, v0-v7, v16, v26, v27.
// x0, x1 and x2 are also advanced. The stack is not touched.
asm_function PostFuncBiasReluC4Fp16

  // v26 = 6.0 in every half-precision lane: integer 6 is materialised first,
  // then converted int -> float. Used as the upper bound for relu_type == 3.
  movi v26.4h, #6
  scvtf v26.4h, v26.4h
  // v27 = 0 in every lane: lower bound for both clamping modes.
  dup v27.4h, wzr

  // x12 = (oc4div + oc4mod) * 2: the post-index step applied to the dst
  // pointer after every store, i.e. the byte distance between two
  // consecutive output elements of the same channel group.
  mov x10, #2
  add x12, x3, x4
  mul x12, x12, x10

  // w10 = index of the first channel of the current group, starts at 0.
  mov w10, #0

// ---- Outer loop over full 4-channel groups --------------------------------
// Exits only when w10 == w3 exactly (equality test, w10 advances by 4).
// NOTE(review): this presumably relies on oc4div being a multiple of 4 -
// confirm against the caller.
Loop_C4:
  cmp w10, w3
  beq Loop_C1
  // x15 = dst + w10 * 2: write pointer for this channel group.
  mov x15, #2
  mul x14, x10, x15
  add x15, x0, x14
  add w10, w10, #4
  // w13 = elements still to process in this group.
  mov w13, w5
  // v16 = the 4 bias values for this group; bias pointer advances 8 bytes.
  ld1 {v16.4h}, [x2], #8

// ---- 8 elements per iteration while at least 8 remain ---------------------
Loop_8x4:
  cmp w13, #8
  blt Loop_4x4
  sub w13, w13, #8
  // Two 32-byte loads: 8 consecutive 4-lane vectors from src.
  ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1], #32
  ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32

  fadd v0.4h, v0.4h, v16.4h
  fadd v1.4h, v1.4h, v16.4h
  fadd v2.4h, v2.4h, v16.4h
  fadd v3.4h, v3.4h, v16.4h
  fadd v4.4h, v4.4h, v16.4h
  fadd v5.4h, v5.4h, v16.4h
  fadd v6.4h, v6.4h, v16.4h
  fadd v7.4h, v7.4h, v16.4h

  // Select the clamping mode. The three labels below are laid out so that
  // Relu6 falls through into Relu, which falls through into Write.
  cmp x7, #3
  beq Relu6_8x4
  cmp x7, #1
  beq Relu_8x4
  b Write_8x4
Relu6_8x4:
  // Upper clamp: min(x, 6.0). Falls through to the lower clamp.
  fmin v0.4h, v0.4h, v26.4h
  fmin v1.4h, v1.4h, v26.4h
  fmin v2.4h, v2.4h, v26.4h
  fmin v3.4h, v3.4h, v26.4h
  fmin v4.4h, v4.4h, v26.4h
  fmin v5.4h, v5.4h, v26.4h
  fmin v6.4h, v6.4h, v26.4h
  fmin v7.4h, v7.4h, v26.4h
Relu_8x4:
  // Lower clamp: max(x, 0).
  fmax v0.4h, v0.4h, v27.4h
  fmax v1.4h, v1.4h, v27.4h
  fmax v2.4h, v2.4h, v27.4h
  fmax v3.4h, v3.4h, v27.4h
  fmax v4.4h, v4.4h, v27.4h
  fmax v5.4h, v5.4h, v27.4h
  fmax v6.4h, v6.4h, v27.4h
  fmax v7.4h, v7.4h, v27.4h
Write_8x4:
  // One 8-byte store per element; x15 steps by x12 after each store.
  st1 {v0.4h}, [x15], x12
  st1 {v1.4h}, [x15], x12
  st1 {v2.4h}, [x15], x12
  st1 {v3.4h}, [x15], x12
  st1 {v4.4h}, [x15], x12
  st1 {v5.4h}, [x15], x12
  st1 {v6.4h}, [x15], x12
  st1 {v7.4h}, [x15], x12
  b Loop_8x4

// ---- One batch of 4 elements if at least 4 remain -------------------------
// Not a loop: after the store it falls through into Loop_1x4.
Loop_4x4:
  cmp w13, #4
  blt Loop_1x4
  sub w13, w13, #4
  ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1], #32
  fadd v0.4h, v0.4h, v16.4h
  fadd v1.4h, v1.4h, v16.4h
  fadd v2.4h, v2.4h, v16.4h
  fadd v3.4h, v3.4h, v16.4h
  cmp x7, #3
  beq Relu6_4x4
  cmp x7, #1
  beq Relu_4x4
  b Write_4x4
Relu6_4x4:
  // Upper clamp, then falls through to the lower clamp.
  fmin v0.4h, v0.4h, v26.4h
  fmin v1.4h, v1.4h, v26.4h
  fmin v2.4h, v2.4h, v26.4h
  fmin v3.4h, v3.4h, v26.4h
Relu_4x4:
  fmax v0.4h, v0.4h, v27.4h
  fmax v1.4h, v1.4h, v27.4h
  fmax v2.4h, v2.4h, v27.4h
  fmax v3.4h, v3.4h, v27.4h
Write_4x4:
  st1 {v0.4h}, [x15], x12
  st1 {v1.4h}, [x15], x12
  st1 {v2.4h}, [x15], x12
  st1 {v3.4h}, [x15], x12

// ---- Remaining elements, one at a time ------------------------------------
// The clamping mode is chosen once here; each of the three loops below then
// runs until w13 reaches 0 and leaves through HW_Add.
Loop_1x4:
  cmp x7, #3
  beq Relu6_1x4
  cmp x7, #1
  beq Relu_1x4
  b Write_1x4
Relu6_1x4:
  // bias + clamp to [0, 6]
  cmp w13, #0
  beq HW_Add
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  fmin v0.4h, v0.4h, v26.4h
  fmax v0.4h, v0.4h, v27.4h
  st1 {v0.4h}, [x15], x12
  b Relu6_1x4
Relu_1x4:
  // bias + clamp to >= 0
  cmp w13, #0
  beq HW_Add
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  fmax v0.4h, v0.4h, v27.4h
  st1 {v0.4h}, [x15], x12
  b Relu_1x4
Write_1x4:
  // bias only
  cmp w13, #0
  beq HW_Add
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  st1 {v0.4h}, [x15], x12
  b Write_1x4

// End of one full channel group: skip plane_stride on the src pointer
// (x6 is added to x1 unscaled) and start the next group.
HW_Add:
  add x1, x1, x6
  b Loop_C4

// ---- Leftover channels (oc4mod) -------------------------------------------
// Reached when w10 == w3. Nothing to do if oc4mod is 0. Otherwise a single
// extra group is processed: full 4-lane vectors are still loaded from src and
// bias, but only the first 1, 2 or 3 lanes of each result are stored.
Loop_C1:
  cmp w4, #0
  beq End
  mov w13, w5
  ld1 {v16.4h}, [x2], #8
  // x0 = dst + w10 * 2: first leftover channel. x0 itself is the write
  // pointer from here on.
  mov x15,  #2
  mul x14, x10, x15
  add x0, x0, x14

  // Dispatch on oc4mod. Any non-zero value other than 2 or 3 ends up in
  // Loop_C1_1, either by the explicit branch or by falling through.
  cmp w4, #1
  beq Loop_C1_1
  cmp w4, #2
  beq Loop_C1_2
  cmp w4, #3
  beq Loop_C1_3

// -- oc4mod == 1: store lane 0 only (one halfword per element) --
Loop_C1_1:
  cmp x7, #3
  beq Loop_C1_1_Relu6
  cmp x7, #1
  beq Loop_C1_1_Relu
  b Loop_C1_1_Write
Loop_C1_1_Relu6:
  cmp w13, #0
  beq End
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  fmin v0.4h, v0.4h, v26.4h
  fmax v0.4h, v0.4h, v27.4h
  st1 {v0.h}[0], [x0], x12
  b Loop_C1_1_Relu6
Loop_C1_1_Relu:
  cmp w13, #0
  beq End
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  fmax v0.4h, v0.4h, v27.4h
  st1 {v0.h}[0], [x0], x12
  b Loop_C1_1_Relu
Loop_C1_1_Write:
  cmp w13, #0
  beq End
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  st1 {v0.h}[0], [x0], x12
  b Loop_C1_1_Write

// -- oc4mod == 2: store lanes 0-1 as one 32-bit element --
Loop_C1_2:
  cmp x7, #3
  beq Loop_C1_2_Relu6
  cmp x7, #1
  beq Loop_C1_2_Relu
  b Loop_C1_2_Write
Loop_C1_2_Relu6:
  cmp w13, #0
  beq End
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  fmin v0.4h, v0.4h, v26.4h
  fmax v0.4h, v0.4h, v27.4h
  st1 {v0.s}[0], [x0], x12
  b Loop_C1_2_Relu6
Loop_C1_2_Relu:
  cmp w13, #0
  beq End
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  fmax v0.4h, v0.4h, v27.4h
  st1 {v0.s}[0], [x0], x12
  b Loop_C1_2_Relu
Loop_C1_2_Write:
  cmp w13, #0
  beq End
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  st1 {v0.s}[0], [x0], x12
  b Loop_C1_2_Write

// -- oc4mod == 3: lanes 0-1 via x0 (32-bit store), lane 2 via x15 --
Loop_C1_3:
  // x15 = x0 + 4: address of the third halfword. Both pointers step by x12
  // after every store, so they stay 4 bytes apart.
  add x15, x0, #4
  cmp x7, #3
  beq Loop_C1_3_Relu6
  cmp x7, #1
  beq Loop_C1_3_Relu
  b Loop_C1_3_Write
Loop_C1_3_Relu6:
  cmp w13, #0
  beq End
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  fmin v0.4h, v0.4h, v26.4h
  fmax v0.4h, v0.4h, v27.4h
  st1 {v0.s}[0], [x0], x12
  st1 {v0.h}[2], [x15], x12
  b Loop_C1_3_Relu6
Loop_C1_3_Relu:
  cmp w13, #0
  beq End
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  fmax v0.4h, v0.4h, v27.4h
  st1 {v0.s}[0], [x0], x12
  st1 {v0.h}[2], [x15], x12
  b Loop_C1_3_Relu
Loop_C1_3_Write:
  cmp w13, #0
  beq End
  sub w13, w13, #1
  ld1 {v0.4h}, [x1], #8
  fadd v0.4h, v0.4h, v16.4h
  st1 {v0.s}[0], [x0], x12
  st1 {v0.h}[2], [x15], x12
  b Loop_C1_3_Write

// Common exit; no registers need restoring.
End:
  ret

#endif
